library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
date()
## [1] "Sun Nov 26 16:16:06 2023"
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data("Boston")
colnames(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
dim(Boston)
## [1] 506 14
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
Boston_melted <- melt(Boston)
## Using as id variables
ggplot(Boston_melted, aes(x = value)) +
geom_histogram(binwidth = 1, fill = "turquoise", color = "blue") +
facet_wrap(~variable, scales = "free") +
theme_minimal()
\(~\)
\(~\)
cor_matrix <- cor(Boston)
corrplot(cor_matrix, method="circle")
\(~\) ##### Here I can see there are some strong negative corerelations: dis v (indus-nox-age) and mdev v lstat. Also there are some strong positive correlations: mdev v rm (which makes sense as the more rooms you have, ideally, the higher the value of your property), idus v nox (the more industry you have the higher the relative air pollution level).
\(~\)
Boston_scaled <- as.data.frame(scale(Boston))
summary(Boston_scaled)
## crim zn indus chas
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563 Min. :-0.2723
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668 1st Qu.:-0.2723
## Median :-0.390280 Median :-0.48724 Median :-0.2109 Median :-0.2723
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150 3rd Qu.:-0.2723
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202 Max. : 3.6648
## nox rm age dis
## Min. :-1.4644 Min. :-3.8764 Min. :-2.3331 Min. :-1.2658
## 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366 1st Qu.:-0.8049
## Median :-0.1441 Median :-0.1084 Median : 0.3171 Median :-0.2790
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059 3rd Qu.: 0.6617
## Max. : 2.7296 Max. : 3.5515 Max. : 1.1164 Max. : 3.9566
## rad tax ptratio black
## Min. :-0.9819 Min. :-1.3127 Min. :-2.7047 Min. :-3.9033
## 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876 1st Qu.: 0.2049
## Median :-0.5225 Median :-0.4642 Median : 0.2746 Median : 0.3808
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058 3rd Qu.: 0.4332
## Max. : 1.6596 Max. : 1.7964 Max. : 1.6372 Max. : 0.4406
## lstat medv
## Min. :-1.5296 Min. :-1.9063
## 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 3.5453 Max. : 2.9865
bins <- quantile(Boston_scaled$crim)
# create a categorical variable 'crime'
crime <- cut(Boston_scaled$crim, breaks = bins,labels = c("low","med_low","med_high","high"), include.lowest = TRUE)
# remove original crim from the dataset
Boston_scaled <- dplyr::select(Boston_scaled, -crim)
# add the new categorical value to scaled data
Boston_scaled <- data.frame(Boston_scaled, crime)
summary(Boston_scaled)
## zn indus chas nox
## Min. :-0.48724 Min. :-1.5563 Min. :-0.2723 Min. :-1.4644
## 1st Qu.:-0.48724 1st Qu.:-0.8668 1st Qu.:-0.2723 1st Qu.:-0.9121
## Median :-0.48724 Median :-0.2109 Median :-0.2723 Median :-0.1441
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.04872 3rd Qu.: 1.0150 3rd Qu.:-0.2723 3rd Qu.: 0.5981
## Max. : 3.80047 Max. : 2.4202 Max. : 3.6648 Max. : 2.7296
## rm age dis rad
## Min. :-3.8764 Min. :-2.3331 Min. :-1.2658 Min. :-0.9819
## 1st Qu.:-0.5681 1st Qu.:-0.8366 1st Qu.:-0.8049 1st Qu.:-0.6373
## Median :-0.1084 Median : 0.3171 Median :-0.2790 Median :-0.5225
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4823 3rd Qu.: 0.9059 3rd Qu.: 0.6617 3rd Qu.: 1.6596
## Max. : 3.5515 Max. : 1.1164 Max. : 3.9566 Max. : 1.6596
## tax ptratio black lstat
## Min. :-1.3127 Min. :-2.7047 Min. :-3.9033 Min. :-1.5296
## 1st Qu.:-0.7668 1st Qu.:-0.4876 1st Qu.: 0.2049 1st Qu.:-0.7986
## Median :-0.4642 Median : 0.2746 Median : 0.3808 Median :-0.1811
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.5294 3rd Qu.: 0.8058 3rd Qu.: 0.4332 3rd Qu.: 0.6024
## Max. : 1.7964 Max. : 1.6372 Max. : 0.4406 Max. : 3.5453
## medv crime
## Min. :-1.9063 low :127
## 1st Qu.:-0.5989 med_low :126
## Median :-0.1449 med_high:126
## Mean : 0.0000 high :127
## 3rd Qu.: 0.2683
## Max. : 2.9865
# Test and train data sets:
n <- nrow(Boston_scaled)
# choose randomly 80% of the rows
ind <- sample(n, size = n * 0.8)
# create train set
train <- Boston_scaled[ind,]
# create test set
test <- Boston_scaled[-ind,]
\(~\)
# linear discriminant analysis
lda.fit <- lda(crime ~ ., data = train)
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2549505 0.2425743 0.2524752 0.2500000
##
## Group means:
## zn indus chas nox rm age
## low 0.95434742 -0.9258605 -0.11943197 -0.8741718 0.45741131 -0.8706537
## med_low -0.08231376 -0.2915812 -0.03128211 -0.5969715 -0.10157606 -0.4512961
## med_high -0.36785679 0.1645982 0.15226017 0.4166923 0.09827811 0.4208230
## high -0.48724019 1.0171306 -0.15538550 1.0045416 -0.47250601 0.8098159
## dis rad tax ptratio black lstat
## low 0.8845325 -0.6886218 -0.7567361 -0.37768596 0.3791635 -0.78111842
## med_low 0.4035369 -0.5529539 -0.5132547 -0.01151131 0.3234892 -0.19001948
## med_high -0.3735685 -0.4053858 -0.2987757 -0.30822868 0.1283773 -0.01850998
## high -0.8573713 1.6379981 1.5139626 0.78062517 -0.8596600 0.96934948
## medv
## low 0.537232159
## med_low 0.003866546
## med_high 0.161872943
## high -0.740561820
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.07739015 0.70485482 -0.94757111
## indus 0.03442359 -0.28585326 0.59355289
## chas -0.09986824 -0.03341233 0.09490689
## nox 0.30265239 -0.73971012 -1.25654089
## rm -0.10343239 -0.11362586 -0.05313860
## age 0.25622484 -0.35410860 -0.31036464
## dis -0.08265281 -0.30651411 0.19906543
## rad 3.18424483 0.88896601 0.06549547
## tax 0.01258437 -0.05088693 0.15554547
## ptratio 0.08250346 0.12639893 -0.28012998
## black -0.12817334 -0.02200587 0.10961806
## lstat 0.26121936 -0.16069426 0.43245625
## medv 0.19022990 -0.34641337 -0.21014934
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9487 0.0377 0.0136
# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
graphics::arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
# target classes as numeric
classes <- as.numeric(train$crime)
# plot the lda (bi)plot
plot(lda.fit, dimen = 2)
lda.arrows(lda.fit, myscale = 1)
# save the correct classes from test data
correct_classes <- test$crime
# remove the crime variable from test data
test <- dplyr::select(test, -crime)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 14 10 0 0
## med_low 4 13 11 0
## med_high 0 7 16 1
## high 0 0 0 26
\(~\) ##### Looking at model it performs well on the high crime prediction, but the low, med_low and med_high seem to be off slightly. med_low is the next best predicted crime level.
data("Boston")
Boston_scaled <- as.data.frame(scale(Boston))
dist_eu <- dist(Boston_scaled, method = "euclidean")
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
#dist_man <- dist(Boston_scaled, method = "manhattan")
#dist_man
set.seed(123)
# determine the number of clusters
k_max <- 10
# calculate the total within sum of squares
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})
# visualize the results
qplot(x = 1:k_max, y = twcss, geom = 'line')
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
\(~\) ##### Looking at the elbow it appears that 2 is the optimal number of clusters for k means
# k-means clustering
km <- kmeans(Boston, centers = 2)
# plot the Boston dataset with clusters
pairs(Boston, col = km$cluster)
##### I have no idea how to interpret this-
#Data
set.seed(32)
data("Boston")
boston_scaled <- scale(Boston) %>% as.data.frame()
#Set seven clusters:
km_bonus <- kmeans(Boston, centers = 8)
boston_scaled$cluster <- km_bonus$cluster
Boston_bonus <- lda(cluster ~ ., data = boston_scaled)
# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
graphics::arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
# target classes as numeric
classes <- as.numeric(boston_scaled$cluster)
# plot the lda (bi)plot
plot(Boston_bonus, dimen =2)
lda.arrows(Boston_bonus, myscale = 1)
Boston_bonus
## Call:
## lda(cluster ~ ., data = boston_scaled)
##
## Prior probabilities of groups:
## 1 2 3 4 5 6 7
## 0.27075099 0.08695652 0.15217391 0.12252964 0.10869565 0.05335968 0.18379447
## 8
## 0.02173913
##
## Group means:
## crim zn indus chas nox rm
## 1 1.0097765 -0.4872402 1.0662784 -0.04242540 0.9959393 -0.39626518
## 2 -0.4112593 1.8232528 -1.0587789 -0.27232907 -1.2032247 0.32608884
## 3 -0.4103240 0.4669153 -0.6939781 0.08558914 -0.7573615 0.40725570
## 4 -0.3115695 -0.4872402 0.6973678 0.10868063 0.6498658 -0.24065169
## 5 -0.3906267 -0.1882696 -0.4589025 -0.05757815 -0.6526048 0.01656427
## 6 -0.4105500 0.7276120 -0.7660389 -0.27232907 -0.8389328 0.12107492
## 7 -0.3726011 -0.2488802 -0.5624188 0.15101504 -0.2307439 0.26980542
## 8 -0.1918628 -0.4872402 0.8121161 0.08558914 1.3206359 -0.52452958
## age dis rad tax ptratio black
## 1 0.7599946 -0.82659650 1.5757732 1.53915759 0.8040926 -0.7189340
## 2 -1.4995727 1.76550267 -0.6321108 -0.51640025 -0.7080115 0.3436424
## 3 -0.6511612 0.50825445 -0.6835681 -1.09300067 -0.2158123 0.3816348
## 4 0.7665217 -0.74688501 -0.5298939 0.01868990 -0.2268036 0.2441186
## 5 -1.0126162 0.40445826 -0.5725993 -0.71589771 -0.1533051 0.3386187
## 6 -1.1910518 0.90003854 -0.5735274 -0.05678564 -0.3404312 0.3375271
## 7 0.4650140 0.04164319 -0.5323637 -0.67571060 -0.2506439 0.3386363
## 8 0.8257272 -0.69874374 -0.5538062 -0.12654817 -0.6723188 -1.8525431
## lstat medv
## 1 0.8432167 -0.6807081
## 2 -0.8161977 0.3320129
## 3 -0.5638615 0.6660396
## 4 0.2594944 -0.1294832
## 5 -0.5252045 0.2809099
## 6 -0.5717207 0.2900036
## 7 -0.1610779 0.2011491
## 8 0.6385135 -0.5996044
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4 LD5
## crim 0.009913887 -0.137609161 -0.04036563 0.06710828 0.008706277
## zn -0.063869885 -0.359539354 -0.03252327 -0.37386866 -0.985470185
## indus -0.064553782 -0.195044951 -0.33201434 0.87866138 -0.853101757
## chas 0.051357463 0.035927002 -0.01879004 -0.06102304 0.054091077
## nox -0.064270060 0.327990930 0.43933045 0.36992087 -0.682424547
## rm -0.031831393 0.027532509 -0.07621914 -0.14238990 0.106374027
## age -0.470229341 1.450265293 -0.23689536 -1.46202224 -0.271967600
## dis 0.361412866 -0.313512208 0.20731668 -0.62537375 -0.395082557
## rad -1.643884814 -0.791845574 -3.25137506 -0.13663787 -0.481829789
## tax -9.099980888 -0.327782726 3.36875714 -0.64786007 1.410249515
## ptratio -0.008405428 0.060541663 -0.29843909 0.19223768 -0.102382405
## black 0.023126886 -0.089392096 -0.06586979 -0.19721961 0.616762841
## lstat -0.032547271 -0.183432824 -0.09333305 0.15316806 0.091791088
## medv -0.062060374 0.005458806 -0.02837714 0.34842936 -0.056421542
## LD6 LD7
## crim 0.098841166 0.03527449
## zn 0.842438878 0.35861794
## indus 0.904577426 -0.95614725
## chas -0.048494403 0.02316471
## nox -0.321696150 0.19808743
## rm 0.079509917 -0.04629482
## age 0.130244689 -0.20378571
## dis -0.476692936 -0.65750487
## rad 0.001318226 -1.00217642
## tax -0.281439177 0.85946009
## ptratio 0.336896632 0.79163091
## black 0.782771708 -0.43714271
## lstat 0.104232162 0.73179612
## medv 0.242827431 1.10579041
##
## Proportion of trace:
## LD1 LD2 LD3 LD4 LD5 LD6 LD7
## 0.9703 0.0155 0.0084 0.0029 0.0018 0.0009 0.0002
####Super Bonus
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:reshape':
##
## rename
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers',color = train$crime)